/* Copyright 2012 Tobias Marschall
 *
 * This file is part of CLEVER.
 *
 * CLEVER is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CLEVER is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef SEQUENCEUTILS_H_
#define SEQUENCEUTILS_H_

#include <vector>
#include <limits>
#include <iostream>
#include <iomanip>

/** Class containing some (static) methods with sequence analysis algorithms. */
class SequenceUtils {
private:
	inline static size_t translate(int a) {
		if (a<0) return (-2*a)-1;
		else return a*2;
	}
	template <typename RefType, typename QueryType>
	static void printTable(const std::vector<std::vector<int> >& table, std::ostream& out, RefType ref, QueryType query, size_t ref_start, bool backwards) {
		int rows = table.size() + table[table.size()-1].size() / 2;
		out << "Table:" << std::endl;
		out << "  ";
		for (int j=0; j<table.size(); ++j) {
			if (backwards) {
				out << " " << query[query.size()-1-j] << " ";
			} else {
				out << " " << query[j] << " ";
			}
		}
		out << std::endl;
		for (int i=0; i<rows; ++i) {
			if (backwards) {
				int k = ref_start - i;
				out << (k>=0?ref[k]:'-') << " ";
			} else {
				int k = ref_start + i;
				out << (k<ref.size()?ref[k]:'-') << " ";
			}
			for (int j=0; j<table.size(); ++j) {
				int k = translate(i-j);
				if (k<table[j].size()) {
					out << std::setw(2) << table[j][k] << " ";
				} else {
					out << " - ";
				}
			}
			out << std::endl;
		}
	}
public:
	/** Type for information on how far a match could be extended. */
	typedef struct extension_data_t {
		/** Part of the query that was used. */
		size_t query_length;
		/** End position of alignment on reference, including this position. */
		size_t ref_end_pos;
		/** Edit distance of query prefix and reference. */
		size_t errors;
		extension_data_t(size_t query_length, size_t ref_end_pos, size_t errors) : query_length(query_length), ref_end_pos(ref_end_pos), errors(errors) {}
	} extension_data_t;

	/** Computes a banded alignment between query and reference (starting from given position). A variable length
	 *  prefix of the query is used. The match is extended until the query used completely or the error rate
	 *  (errors / query prefix length) would exceed the given limit by further extending it.
	 *  A number of minAllowedErrors errors is always allowed, however, no matter how much of the query is used.
	 *  If backwards is true, the query is matched from right to left against the reference. */
	template <typename RefType, typename QueryType>
	static extension_data_t bandedExtension(RefType ref, QueryType query, size_t ref_start, size_t minAllowedErrors, double maxErrorRate, bool backwards = false) {
		typedef int cost_t;
		assert((size_t)std::numeric_limits<cost_t>::max() >= query.size());
		assert(ref_start < ref.size());
		if (query.size() == 0) {
			return extension_data_t(0, ref_start, 0);
		}
		// dynamic programming table containing the diagonal of the table (banded alignment)
		// table[i] contains the "band" for the query prefix of length i+1 such that
		// table[i] = {dp(i,i), dp(i,i-1), dp(i,i+1), dp(i,i-2), ... } where dp would be a "normal"
		// rectangular dynamic programming table
		std::vector<std::vector<cost_t> > table;
		table.push_back(std::vector<cost_t>(3,0));
		table[0][0] = (query[backwards?(query.size()-1):0] == ref[ref_start])?0:1;
		table[0][1] = 2;
		table[0][2] = 1;
		int good_prefix_length = 0;
		int good_prefix_errors = 0;
		int good_ref_end_pos = ref_start;
		int i = 1;
		for (; i<(int)query.size(); ++i) {
			// compute number of allowed errors for prefix query[0..i]
			int e = std::max(minAllowedErrors, (size_t)((i+1)*maxErrorRate));
			table.push_back(std::vector<cost_t>(1+2*e, i+2));
			// current column
			std::vector<cost_t>& current = table[i];
			// column left of the current one
			const std::vector<cost_t>& left = table[i-1];
			cost_t best_score = i+2;
			int min_j = std::max(-e,-i);
			int max_j;
			if (backwards) {
				max_j = std::min(e,((int)ref_start)-i);
			} else {
				max_j = std::min(e,((int)ref.size())-((int)ref_start)-i-1);
			}
			for (int j=min_j; j<=max_j; ++j) {
				cost_t value = i+2;
				size_t j_t = translate(j);
				// cell above
				size_t k = translate(j-1);
				if (k<current.size()) value = std::min(value, current[k]+1);
				// cell left
				k = translate(j+1);
				if (k<left.size()) value = std::min(value, left[k]+1);
				// cell up left
				if (i+j>0) {
					bool match;
					if (backwards) {
						match = query[query.size()-1-i] == ref[ref_start-i-j];
					} else {
						match = query[i] == ref[ref_start+i+j];
					}
					if (j_t<left.size()) value = std::min(value, left[j_t] + (match?0:1));
					if (match && (value<=e) && ((good_prefix_length<i+1) || (value<good_prefix_errors))) {
						good_prefix_length = i+1;
						good_prefix_errors = value;
						if (backwards) {
							good_ref_end_pos = ref_start-i-j;
						} else {
							good_ref_end_pos = ref_start+i+j;
						}
					}
				}
				current[j_t] = value;
				best_score = std::min(best_score, value);
			}
			if (best_score > e) {
				// printTable(table, std::cout, ref, query, ref_start, backwards);
				return extension_data_t(good_prefix_length, good_ref_end_pos, good_prefix_errors);
			}
		}
		// printTable(table, std::cout, ref, query, ref_start, backwards);
		return extension_data_t(good_prefix_length, good_ref_end_pos, good_prefix_errors);
	}

};

#endif /* SEQUENCEUTILS_H_ */
